1   package org.apache.solr.search;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import org.apache.solr.SolrTestCaseJ4;
21  import org.junit.BeforeClass;
22  import org.junit.Test;
23  
24  public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
25  
26    public String getCoreName() {
27      return "basic";
28    }
29  
30    @BeforeClass
31    public static void beforeTests() throws Exception {
32      initCore("solrconfig-basic.xml", "schema-folding.xml");
33  
34      String docs[] = {
35          "abcdefg1 finger",
36          "gangs hijklmn1",
37          "opqrstu1 zilly",
38      };
39  
40      // prepare the index
41      for (int i = 0; i < docs.length; i++) {
42        String num = Integer.toString(i);
43        String boolVal = ((i % 2) == 0) ? "true" : "false";
44        assertU(adoc("id", num,
45            "int_f", num,
46            "float_f", num,
47            "long_f", num,
48            "double_f", num,
49            "bool_f", boolVal,
50            "date_f", "200" + Integer.toString(i % 10) + "-01-01T00:00:00Z",
51            "content", docs[i],
52            "content_ws", docs[i],
53            "content_rev", docs[i],
54            "content_multi", docs[i],
55            "content_lower_token", docs[i],
56            "content_oldstyle", docs[i],
57            "content_charfilter", docs[i],
58            "content_multi_bad", docs[i],
59            "content_straight", docs[i],
60            "content_lower", docs[i],
61            "content_folding", docs[i],
62            "content_stemming", docs[i],
63            "content_keyword", docs[i]
64        ));
65      }
66      // Mixing and matching amongst various languages is probalby a bad thing, so add some tests for various
67      // special filters
68      int idx = docs.length;
69      // Greek
70      assertU(adoc("id", Integer.toString(idx++), "content_greek", "Μάϊος"));
71      assertU(adoc("id", Integer.toString(idx++), "content_greek", "ΜΆΪΟΣ"));
72  
73      // Turkish
74  
75      assertU(adoc("id", Integer.toString(idx++), "content_turkish", "\u0130STANBUL"));
76      assertU(adoc("id", Integer.toString(idx++), "content_turkish", "ISPARTA"));
77      assertU(adoc("id", Integer.toString(idx++), "content_turkish", "izmir"));
78  
79  
80      // Russian normalization
81      assertU(adoc("id", Integer.toString(idx++), "content_russian", "электромагнитной"));
82      assertU(adoc("id", Integer.toString(idx++), "content_russian", "Вместе"));
83      assertU(adoc("id", Integer.toString(idx++), "content_russian", "силе"));
84  
85      // persian normalization
86      assertU(adoc("id", Integer.toString(idx++), "content_persian", "هاي"));
87      
88      // arabic normalization
89      assertU(adoc("id", Integer.toString(idx++), "content_arabic", "روبرت"));
90  
91      // hindi normalization
92      assertU(adoc("id", Integer.toString(idx++), "content_hindi", "हिंदी"));
93      assertU(adoc("id", Integer.toString(idx++), "content_hindi", "अाअा"));
94      
95      // german normalization
96      assertU(adoc("id", Integer.toString(idx++), "content_german", "weissbier"));
97      
98      // cjk width normalization
99      assertU(adoc("id", Integer.toString(idx++), "content_width", "ヴィッツ"));
100     assertU(commit());
101   }
102 
103   @Test
104   public void testPrefixCaseAccentFolding() throws Exception {
105     String matchOneDocPrefixUpper[][] = {
106         {"A*", "ÁB*", "ABÇ*"},   // these should find only doc 0
107         {"H*", "HÏ*", "HìJ*"},   // these should find only doc 1
108         {"O*", "ÖP*", "OPQ*"},   // these should find only doc 2
109     };
110 
111     String matchRevPrefixUpper[][] = {
112         {"*Ğ1", "*DEfG1", "*EfG1"},
113         {"*N1", "*LmŊ1", "*MÑ1"},
114         {"*Ǖ1", "*sTu1", "*RŠTU1"}
115     };
116 
117     // test the prefix queries find only one doc where the query is uppercased. Must go through query parser here!
118     for (int idx = 0; idx < matchOneDocPrefixUpper.length; idx++) {
119       for (int jdx = 0; jdx < matchOneDocPrefixUpper[idx].length; jdx++) {
120         String me = matchOneDocPrefixUpper[idx][jdx];
121         assertQ(req("q", "content:" + me),
122             "//*[@numFound='1']",
123             "//*[@name='id'][.='" + Integer.toString(idx) + "']");
124         assertQ(req("q", "content_ws:" + me),
125             "//*[@numFound='1']",
126             "//*[@name='id'][.='" + Integer.toString(idx) + "']");
127         assertQ(req("q", "content_multi:" + me),
128             "//*[@numFound='1']",
129             "//*[@name='id'][.='" + Integer.toString(idx) + "']");
130         assertQ(req("q", "content_lower_token:" + me),
131             "//result[@numFound='1']",
132             "//*[@name='id'][.='" + Integer.toString(idx) + "']");
133         assertQ(req("q", "content_oldstyle:" + me),
134             "//result[@numFound='0']");
135       }
136     }
137     for (int idx = 0; idx < matchRevPrefixUpper.length; idx++) {
138       for (int jdx = 0; jdx < matchRevPrefixUpper[idx].length; jdx++) {
139         String me = matchRevPrefixUpper[idx][jdx];
140         assertQ(req("q", "content_rev:" + me),
141             "//*[@numFound='1']",
142             "//*[@name='id'][.='" + Integer.toString(idx) + "']");
143       }
144     }
145   }
146 
147   // test the wildcard queries find only one doc  where the query is uppercased and/or accented.
148   @Test
149   public void testWildcardCaseAccentFolding() throws Exception {
150     String matchOneDocWildUpper[][] = {
151         {"Á*C*", "ÁB*1", "ABÇ*g1", "Á*FG1"},      // these should find only doc 0
152         {"H*k*", "HÏ*l?*", "HìJ*n*", "HìJ*m*"},   // these should find only doc 1
153         {"O*ř*", "ÖP*ş???", "OPQ*S?Ů*", "ÖP*1"},  // these should find only doc 2
154     };
155 
156     for (int idx = 0; idx < matchOneDocWildUpper.length; idx++) {
157       for (int jdx = 0; jdx < matchOneDocWildUpper[idx].length; jdx++) {
158         String me = matchOneDocWildUpper[idx][jdx];
159         assertQ("Error with " + me, req("q", "content:" + me),
160             "//result[@numFound='1']",
161             "//*[@name='id'][.='" + Integer.toString(idx) + "']");
162         assertQ(req("q", "content_ws:" + me),
163             "//result[@numFound='1']",
164             "//*[@name='id'][.='" + Integer.toString(idx) + "']");
165         assertQ(req("q", "content_multi:" + me),
166             "//result[@numFound='1']",
167             "//*[@name='id'][.='" + Integer.toString(idx) + "']");
168         assertQ(req("q", "content_oldstyle:" + me),
169             "//result[@numFound='0']");
170       }
171     }
172   }
173 
174   @Test
175   public void testLowerTokenizer() {
176     // The lowercasetokenizer will remove the '1' from the index, but not from the query, thus the special test.
177     assertQ(req("q", "content_lower_token:Á*C*"), "//result[@numFound='1']");
178     assertQ(req("q", "content_lower_token:Á*C*1"), "//result[@numFound='0']");
179     assertQ(req("q", "content_lower_token:h*1"), "//result[@numFound='0']");
180     assertQ(req("q", "content_lower_token:H*1"), "//result[@numFound='0']");
181     assertQ(req("q", "content_lower_token:*1"), "//result[@numFound='0']");
182     assertQ(req("q", "content_lower_token:HÏ*l?*"), "//result[@numFound='1']");
183     assertQ(req("q", "content_lower_token:hȉ*l?*"), "//result[@numFound='1']");
184   }
185 
186   @Test
187   public void testFuzzy() throws Exception {
188     assertQ(req("q", "content:ZiLLx~1"),
189             "//result[@numFound='1']");
190     assertQ(req("q", "content_straight:ZiLLx~1"),      // case preserving field shouldn't match
191            "//result[@numFound='0']");
192     assertQ(req("q", "content_folding:ZiLLx~1"),       // case preserving field shouldn't match
193            "//result[@numFound='0']");
194   }
195 
196   @Test
197   public void testRegex() throws Exception {
198     assertQ(req("q", "content:/Zill[a-z]/"),
199         "//result[@numFound='1']");
200     assertQ(req("q", "content:/Zill[A-Z]/"),   // everything in the regex gets lowercased?
201         "//result[@numFound='1']");
202     assertQ(req("q", "content_keyword:/.*Zill[A-Z]/"),
203         "//result[@numFound='1']");
204 
205     assertQ(req("q", "content_straight:/Zill[a-z]/"),      // case preserving field shouldn't match
206         "//result[@numFound='0']");
207     assertQ(req("q", "content_folding:/Zill[a-z]/"),       // case preserving field shouldn't match
208         "//result[@numFound='0']");
209 
210     assertQ(req("q", "content_keyword:/Abcdefg1 Finger/"), // test spaces
211         "//result[@numFound='1']");
212 
213   }
214 
215 
216 
217   @Test
218   public void testGeneral() throws Exception {
219     assertQ(req("q", "content_stemming:fings*"), "//result[@numFound='0']"); // should not match (but would if fings* was stemmed to fing*
220     assertQ(req("q", "content_stemming:fing*"), "//result[@numFound='1']");
221   }
222 
223   // Phrases should fail. This test is mainly a marker so if phrases ever do start working with wildcards we go
224   // and update the documentation
225   @Test
226   public void testPhrase() {
227     assertQ(req("q", "content:\"silly ABCD*\""),
228         "//result[@numFound='0']");
229   }
230 
231   @Test
232   public void testWildcardRange() {
233     assertQ(req("q", "content:[* TO *]"),
234         "//result[@numFound='3']");
235     assertQ(req("q", "content:[AB* TO Z*]"),
236         "//result[@numFound='3']");
237     assertQ(req("q", "content:[AB*E?G* TO TU*W]"),
238         "//result[@numFound='3']");
239   }
240 
241 
242   // Does the char filter get correctly handled?
243   @Test
244   public void testCharFilter() {
245     assertQ(req("q", "content_charfilter:" + "Á*C*"),
246         "//result[@numFound='1']",
247         "//*[@name='id'][.='0']");
248     assertQ(req("q", "content_charfilter:" + "ABÇ*g1"),
249         "//result[@numFound='1']",
250         "//*[@name='id'][.='0']");
251     assertQ(req("q", "content_charfilter:" + "HÏ*l?*"),
252         "//result[@numFound='1']",
253         "//*[@name='id'][.='1']");
254   }
255 
256   @Test
257   public void testRangeQuery() {
258     assertQ(req("q", "content:" + "{Ȫp*1 TO QŮ*}"),
259         "//result[@numFound='1']",
260         "//*[@name='id'][.='2']");
261 
262     assertQ(req("q", "content:" + "[Áb* TO f?Ñg?r]"),
263         "//result[@numFound='1']",
264         "//*[@name='id'][.='0']");
265 
266   }
267 
268   @Test
269   public void testNonTextTypes() {
270     String[] intTypes = {"int_f", "float_f", "long_f", "double_f"};
271 
272     for (String str : intTypes) {
273       assertQ(req("q", str + ":" + "0"),
274           "//result[@numFound='1']",
275           "//*[@name='id'][.='0']");
276 
277       assertQ(req("q", str + ":" + "[0 TO 2]"),
278           "//result[@numFound='3']",
279           "//*[@name='id'][.='0']",
280           "//*[@name='id'][.='1']",
281           "//*[@name='id'][.='2']");
282     }
283     assertQ(req("q", "bool_f:true"),
284         "//result[@numFound='2']",
285         "//*[@name='id'][.='0']",
286         "//*[@name='id'][.='2']");
287 
288     assertQ(req("q", "bool_f:[false TO true]"),
289         "//result[@numFound='3']",
290         "//*[@name='id'][.='0']",
291         "//*[@name='id'][.='1']",
292         "//*[@name='id'][.='2']");
293 
294     assertQ(req("q", "date_f:2000-01-01T00\\:00\\:00Z"),
295         "//result[@numFound='1']",
296         "//*[@name='id'][.='0']");
297 
298     assertQ(req("q", "date_f:[2000-12-31T23:59:59.999Z TO 2002-01-02T00:00:01Z]"),
299         "//result[@numFound='2']",
300         "//*[@name='id'][.='1']",
301         "//*[@name='id'][.='2']");
302   }
303 
304   @Test
305   public void testMultiBad() {
306     try {
307       ignoreException("analyzer returned too many terms");
308       assertQ(req("q", "content_multi_bad:" + "abCD*"));
309       fail("Should throw exception when token evaluates to more than one term");
310     } catch (Exception expected) {
311       assertTrue(expected.getCause() instanceof org.apache.solr.common.SolrException);
312     } finally {
313       resetExceptionIgnores();
314     }
315   }
316   @Test
317   public void testGreek() {
318     assertQ(req("q", "content_greek:μαιο*"), "//result[@numFound='2']");
319     assertQ(req("q", "content_greek:ΜΆΪΟ*"), "//result[@numFound='2']");
320     assertQ(req("q", "content_greek:Μάϊο*"), "//result[@numFound='2']");
321   }
322   @Test
323   public void testRussian() {
324     assertQ(req("q", "content_russian:элЕктРомагн*тной"), "//result[@numFound='1']");
325     assertQ(req("q", "content_russian:Вме*те"), "//result[@numFound='1']");
326     assertQ(req("q", "content_russian:Си*е"), "//result[@numFound='1']");
327     assertQ(req("q", "content_russian:эЛектромагнИт*"), "//result[@numFound='1']");
328   }
329   
330   public void testPersian() {
331     assertQ(req("q", "content_persian:های*"), "//result[@numFound='1']");
332   }
333   
334   public void testArabic() {
335     assertQ(req("q", "content_arabic:روبرـــــــــــــــــــــــــــــــــت*"), "//result[@numFound='1']");
336   }
337   
338   public void testHindi() {
339     assertQ(req("q", "content_hindi:हिन्दी*"), "//result[@numFound='1']");
340     assertQ(req("q", "content_hindi:आआ*"), "//result[@numFound='1']");
341   }
342   
343   public void testGerman() {
344     assertQ(req("q", "content_german:weiß*"), "//result[@numFound='1']");
345   }
346   
347   public void testCJKWidth() {
348     assertQ(req("q", "content_width:ヴィ*"), "//result[@numFound='1']");
349   }
350 }